Code
library(vegabrite)
library(dplyr)
library(tidyr)
library(jsonlite)
library(readxl)
library(lubridate)
library(mapview)
library(sf)
library(geojson)
library(geojsonsf)library(vegabrite)
library(dplyr)
library(tidyr)
library(jsonlite)
library(readxl)
library(lubridate)
library(mapview)
library(sf)
library(geojson)
library(geojsonsf)a. An examples with something I like
I like Daniel’s use of points for his graphic for comparing twins. I think the points make the whole graphic easier to look at and a person can perceive where there are differences between twins. Although he does not differentiate between kits, I like how there are not facets or a selector and I can look at the data all at once.
b. An example with something I don’t like
I don’t love Sucry’s graphic for comparing kits because genetic share seems to be treated as an ordinal categorical variable and I am not sure of his reasoning for doing so. I do not think it does the best at comparing kits because I am not sure what the graphic would look like if the kits all reported the same DNA proportions.
c-d. Graphics and stories
genetics_wide <- read.csv("https://calvin-data304.netlify.app/data/twins-genetics-wide.csv")
genetics_long <- read.csv("https://calvin-data304.netlify.app/data/twins-genetics-long.csv")Comparing twins
vl_chart() |>
vl_mark_line() |>
vl_encode_x("id:N") |>
vl_encode_y("genetic.share:Q") |>
vl_encode_color("pair:N") |>
vl_encode_detail("pair:N") |>
vl_encode_row("kit:N") |>
vl_encode_column("region:N") |>
vl_add_data(genetics_long) |>
vl_add_properties(height = 150, width = 200)Story..
Comparing DNA kits
vl_chart() |>
vl_mark_line() |>
vl_encode_x("kit:N", title = "Kit") |>
vl_encode_y("genetic.share:Q", title = "Genetic Share") |>
vl_encode_color("twin:N") |>
vl_scale_color(scheme = "paired") |>
vl_legend_color(title = "Twin") |>
vl_encode_detail("twin:N") |>
vl_encode_facet("region:N", columns = 3, title = "Region") |>
vl_add_properties(width = 200, height = 120) |>
vl_add_data(genetics_long)Story…
I chose challenge #2, re-making the customer touchpoints graphic.
Issues I noticed with this graphic:
Both the x and y axes could be improved. It is not easy to see which bar goes with each of the date labels. The labels also do not seem to be correct as 2018-08 comes after 2019-03. On the y axis, the number of significant digits is not consistent.
The emphasis on the total touchpoints as seen in the title and labels on the graphic is not the focus of the graphic itself and can not really be seen with the data given. We have information on the total number of touchpoints per customer each month, but not the total number of customers, so we cannot actually calculate/graph the total touchpoints. Therefore, with this data, the total touchpoints should probably not be the main message and if included, should probably just be in the caption or subtitle.
The title and touchpoint labels are also currently quite large and distracting.
Stacked bars are probably not the best option because it is hard to compare the amounts of different types of touchpoints within one month (although maybe this is not as important to the story). It is also hard to see the trends for chat and email touchpoints because they don’t have a common baseline.
touchpoints_wide <- read.csv("https://calvin-data304.netlify.app/data/swd-lets-practice-ex-5-03.csv") |>
mutate(Total = Phone.Touchpoints + Chat.Touchpoints + Email.Touchpoints,
Complete_Date_str = paste0(Date, "-01"),
Complete_Date = ymd(Complete_Date_str)) |>
rename(Phone = Phone.Touchpoints,
Chat = Chat.Touchpoints,
Email = Email.Touchpoints)
touchpoints_long <- touchpoints_wide |>
pivot_longer(Phone:Total, names_to = "Type", values_to = "Touchpoints")
touchpoints_label_data <- touchpoints_long |>
filter(Date == "2020-01")base <- vl_chart() |>
vl_encode_x("Complete_Date:T", title = "") |>
vl_axis_x(format = "%b %Y") |>
vl_encode_y("Touchpoints:Q", title = "Touchpoints per Customer") |>
vl_encode_color("Type:N", legend = FALSE) |>
vl_scale_color(domain = c("Chat", "Phone", "Email", "Total"),
range = c("#1b9e77", "#7570b3", "#d95f02", "black")) Warning: Invalid schema for object passed to or created by
modify_inner_spec.vegaspec_unit
lines <- base |>
vl_mark_line() |>
vl_add_data(touchpoints_long)
labels <- base |>
vl_mark_text(dx = 18) |>
vl_encode_text("Type:N") |>
vl_add_data(touchpoints_label_data)
vl_layer(lines, labels) |>
vl_add_properties(width = 400, height = 200)My data: Tanzania data
tanzania <- read_excel('Tanzania_data.xlsx') |>
mutate(contraception_use = contraception_use / 100,
fam_plan_unmet = fam_plan_unmet / 100)points_base <- vl_chart() |>
vl_encode_x("date:T", title = "") |>
vl_mark_point(strokeWidth = 1) |>
vl_encode_fill(value = "lightgrey") |>
vl_encode_opacity(value = 1) |>
vl_encode_color(value = "black") |>
vl_encode_size("fertility_rate:Q",
title = "Total Fertility Rate",
legend = list(orient = "bottom")) |>
vl_scale_size(domainMin = 5, domainMax = 7, rangeMax = 170)
contraception_points <- points_base |>
vl_encode_y("contraception_use:Q") |>
vl_axis_y(format = "%")
fam_plan_points <- points_base |>
vl_encode_y("fam_plan_unmet:Q")
contraception_line <- vl_chart() |>
vl_mark_line() |>
vl_encode_x("date:T", title = "") |>
vl_encode_y("contraception_use:Q", title = "") |>
vl_encode_color(value = "#7570b3")
fam_plan_line <- vl_chart() |>
vl_mark_line() |>
vl_encode_x("date:T", title = "") |>
vl_encode_y("fam_plan_unmet:Q", title = "") |>
vl_encode_color(value = "#1b9e77")
contraception_label <- vl_chart() |>
vl_mark_text(dx = 260, size = 12) |>
vl_encode_text(value = "Contraception Use") |>
vl_encode_y(datum = 0.384) |>
vl_encode_color(value = "#7570b3")
fam_plan_label <- vl_chart() |>
vl_mark_text(dx = 288, size = 12) |>
vl_encode_text(value = "Unmet Family Planning Need") |>
vl_encode_y(datum = 0.221) |>
vl_encode_color(value = "#1b9e77")
vl_layer(fam_plan_line, contraception_line,
fam_plan_points, contraception_points,
contraception_label, fam_plan_label) |>
vl_add_data(tanzania) |>
vl_encode_tooltip_array(c("date:T", "contraception_use:Q", "fam_plan_unmet:Q", "fertility_rate:Q")) |>
vl_add_properties(width = 400, height = 300,
title = "Contraception Use and Family Planning Increase in Tanzania")Over the last few decades, the use of modern contraception in Tanzania has steadily increased with the largest increase occurring after 2010. The unmet need for family planning has decreased slightly since 1992 by about 5% total. Potentially associated with both of these is the decrease in the total fertility rate for all women ages 15-49.
# Loading in data
us_map_url <- "https://cdn.jsdelivr.net/npm/vega-datasets@2.11.0/data/us-10m.json"
park_visits <- read_excel("National_Park_Data.xlsx",
skip = 2)
park_shp <- read_sf("nps_boundary_centroids.shp")
# Then I used st_write(park_shp, "parks.csv", layer_options = "GEOMETRY=AS_XY") in the console to create parks.csv
parks <- read.csv("parks.csv")# Cleaning and joining data
graphic_data <- parks |>
left_join(park_visits, join_by(UNIT_CODE == `Unit Code`),
relationship = "many-to-many") |>
filter(!is.na(`Recreation Visits`)) |>
select(X, Y, UNIT_CODE, UNIT_NAME, STATE, REGION, UNIT_TYPE, Year, `Recreation Visits`) |>
filter(STATE != "AS" & STATE != "DC" & STATE != "GU" & STATE != "MP" & STATE != "PR" &
STATE != "TT" & STATE != "VI") |>
rename(LAT = Y,
LONG = X,
Visits = `Recreation Visits`) |>
filter(UNIT_TYPE != "International Historic Site" & UNIT_TYPE != "Other Designation") |>
mutate(simpler_type = case_when(UNIT_TYPE %in% c("National River", "National Lakeshore",
"National Seashore", "National Wild & Scenic River") ~
"River, Lakeshore, Seashore",
UNIT_TYPE %in% c("National Historic Site", "National Historical Park") ~
"Historic Site or Park",
UNIT_TYPE == "National Scenic Trail" ~ "Scenic Trail",
UNIT_TYPE %in% c("National Battlefield", "National Battlefield Park",
"National Battlefield Site", "National Memorial",
"National Military Park") ~
"Memorial, Military Park, Battlefield",
UNIT_TYPE %in% c("National Preserve", "National Recreation Area",
"National Reserve") ~
"Reserve, Preserve, Recreation Area",
UNIT_TYPE == "National Monument" ~ "Monument",
TRUE ~ UNIT_TYPE))state_map <-
vl_chart() |>
vl_mark_geoshape(fill = "transparent", stroke = "grey", strokeWidth = 0.5) |>
vl_add_data(
url = us_map_url,
format = list(type = "topojson", feature = "states"))
park_points <-
vl_chart() |>
vl_filter("datum.Year == 2024") |>
vl_mark_point(strokeWidth = 0.3 ) |>
vl_encode(longitude = "LONG:Q", latitude = "LAT:Q") |>
vl_encode_color(value = "black", ) |>
vl_encode_fill("simpler_type:N", title = "") |>
vl_encode_size("Visits:Q") |>
vl_encode_opacity(value = 0.7) |>
vl_add_data(graphic_data)
vl_layer(state_map, park_points) |>
vl_add_properties(projection = list(type = "albersUSA"),
width = 450,
height = 300,
title = "Most Visited National Park Service Lands in 2024") #|> # vl_add_interval_selection(
# name = "panzoom",
# bind = "scales", encodings = c("longitude", "latitude"),
# resolve = "global",
# translate = TRUE,
# zoom = TRUE) Where I used these features:
An encoding channel other than x or y: Masterpiece
Layers: Tanzania Graphic
Facets: Comparing DNA Kits Graphic
Concatenation or repeat:
Non-default settings for a channel’s scale or guide: Tanzania Graphic
Tooltips: Tanzania Graphic
Another kind of interaction (panning/zooming, brushing, sliders, etc.):
a. Name 2 or 3 examples of where I used a feature we did not discuss in class (a new kind of mark, transform, a way to customize something, a way to use an interaction, etc)
b. Name 2 or 3 examples of where I followed the advice of Wilke or Knaflic